# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras layer that creates a self-attention mask."""

from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import tensorflow as tf
from official.modeling import tf_utils

# Create a mask for self-attention.
@tf.keras.utils.register_keras_serializable(package='Text')
class SelfAttentionMask(tf.keras.layers.Layer):
    """Create 3D attention mask from a 2D tensor mask.
  
      inputs[0]: from_tensor: 2D or 3D Tensor of shape
        [batch_size, from_seq_length, ...].
      inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
  
      Returns:
        float Tensor of shape [batch_size, from_seq_length, to_seq_length].
    """
    
    def call(self, inputs):
        # The inputs consist of from_tensor and to_mask.
        from_tensor = inputs[0]
        to_mask = inputs[1]
        from_shape = tf_utils.get_shape_list(from_tensor, expected_rank=[2, 3])
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        
        to_shape = tf_utils.get_shape_list(to_mask, expected_rank=2)
        to_seq_length = to_shape[1]
        
        to_mask = tf.cast(
            tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
            dtype=from_tensor.dtype)
        
        # We don't assume that `from_tensor` is a mask (although it could be). We
        # don't actually care if we attend *from* padding tokens (only *to* padding
        # tokens), so we create a tensor of all ones.
        #
        # `broadcast_ones` = [batch_size, from_seq_length, 1]
        broadcast_ones = tf.ones(
            shape=[batch_size, from_seq_length, 1], dtype=from_tensor.dtype)
        
        # Here we broadcast along two dimensions to create the mask; the
        # elementwise product has shape [batch_size, from_seq_length, to_seq_length].
        mask = broadcast_ones * to_mask
        
        return mask
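

# A minimal usage sketch (illustrative only; the tensor values and the
# __main__ guard below are not part of the original module). It shows the
# layer turning a [batch_size, to_seq_length] padding mask into a
# [batch_size, from_seq_length, to_seq_length] attention mask.
if __name__ == '__main__':
    embeddings = tf.zeros([2, 4, 8])  # [batch_size, from_seq_length, width]
    padding_mask = tf.constant([[1, 1, 1, 0],
                                [1, 1, 0, 0]])  # [batch_size, to_seq_length]
    attention_mask = SelfAttentionMask()([embeddings, padding_mask])
    print(attention_mask.shape)  # (2, 4, 4)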